## Who's Cheating on Your Survey? --------------------------------
## A Detection Approach with Digital Trace Data -----------------
## Reproduction materials (Main text) ---------------------------

set.seed(1234)

# load packages
source("code/packages.r")

# load data
load("data/ger_question.RData")
load("data/ger_respondent.RData")
load("data/ger_dat_response.RData")

## Setting up log
my_log <- file("01-main-text-log.txt") # File name of output log
sink(my_log, append = TRUE, type = "output") # Writing console output to log file
sink(my_log, append = TRUE, type = "message")

cat(readChar(rstudioapi::getSourceEditorContext()$path, # Writing currently opened R script to file
             file.info(rstudioapi::getSourceEditorContext()$path)$size))

## -------------------------------------------------------------
## Figure 1. Distribution of instances of cheating -------------
## -------------------------------------------------------------

# Panel a. Respondent-level
respondent_level_bar <- ger_respondent %>% 
  dplyr::mutate(prop_cheated_on_cat = dplyr::case_when(resp_prop_cheat == 0 ~ "0",
                                                       dplyr::between(resp_prop_cheat, 0.001,0.1) ~ "1-10%",
                                                       dplyr::between(resp_prop_cheat, 0.101,0.2) ~ "11-20%",
                                                       dplyr::between(resp_prop_cheat, 0.201,0.3) ~ "21-30%",
                                                       dplyr::between(resp_prop_cheat, 0.301,0.4) ~ "31-40%",
                                                       dplyr::between(resp_prop_cheat, 0.401,0.5) ~ "41-50%",
                                                       dplyr::between(resp_prop_cheat, 0.501,0.6) ~ "51-60%",
                                                       dplyr::between(resp_prop_cheat, 0.601,0.7) ~ "61-70%",
                                                       dplyr::between(resp_prop_cheat, 0.701,0.8) ~ "71-80%",
                                                       dplyr::between(resp_prop_cheat, 0.801,0.9) ~ "81-90%",
                                                       dplyr::between(resp_prop_cheat, 0.901,1) ~ "91-100%")) %>%
  ggplot(., aes(x=prop_cheated_on_cat)) +
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  scale_y_continuous(labels= scales::percent) +
  theme_minimal(base_family = "Roboto Condensed") +
  labs(x = "Percentage of items cheated on",
       y = "Percentage of respondents") +
  theme(axis.text.x = element_text(size = 7))

respondent_level_bar

ggsave("figures/fig-01a-ger_cheating_distribution_count.png", respondent_level_bar, width = 10, height = 6, units = "cm")


# Panel b. Item-level
item_level_dotplot <- ger_question %>% 
  dplyr::mutate(prop_cheat_groups = round(ques_prop_cheat, 3)*100) %>%
  dplyr::group_by(prop_cheat_groups,item_label, ques_type_diff) %>%
  dplyr::summarize(n = dplyr::n_distinct(question)) %>%
  ggplot(aes(x = prop_cheat_groups, y= n, fill= ques_type_diff)) +
  geom_dotplot(stackgroups = TRUE, binwidth = 1/3, method = "histodot") +
  scale_y_continuous(breaks = c(0,0.325,0.67), labels = c(0,5,10)) +
  scale_fill_manual(values = c("#e66101", "#5e3c99", "#fdb863", "#b2abd2"), 
                    limits = c("Elite - Text", "Factual", "Elite - Pictures", "Event"),
                    labels = c("Elite - Verbal", "Factual", "Elite - Visual", "Event")) +
  theme_minimal(base_family = "Roboto Condensed") +
  theme(legend.position = "top",
        axis.ticks.y = element_line(),
        legend.title = element_blank(),
        legend.text = ggtext::element_markdown()) +
  labs(x = "Percentage of cheating instances by item",
       y = "Number of items")

item_level_dotplot

ggsave("figures/fig-01b-ger_cheating_by_ques_type_dotplot.png", item_level_dotplot, width = 10, height = 6, units = "cm")

## -------------------------------------------------------------
## Figure 2. Bayesian logistic mixed-effects model -------------
## -------------------------------------------------------------

## Modeling -----------

# defining covars of interest
respondent_covars.resc <- c("gender", "educ_cat", "resc.age.2", "resc.internal_efficacy",  "resc.resp_prop_correct_wo_cheating","twohoursweek")
question_covars.resc <- c("ques_type_diff", "resc.ques_prop_correct") 

# creating df for model
ger_dat_response_model <- na.omit(ger_dat_response[,c("cheat", "question", "personid", respondent_covars.resc, question_covars.resc)])
ger_dat_response_model <- labelled::unlabelled(ger_dat_response_model) %>% as.data.frame()

# setting up and running stan_glmer
options(mc.cores = parallel::detectCores() - 2) 
cheating_model = rstanarm::stan_glmer(as.numeric(cheat) ~ gender + educ_cat + resc.age.2 + resc.internal_efficacy + 
                                  resc.resp_prop_correct_wo_cheating + ques_type_diff + resc.ques_prop_correct + twohoursweek +
                                  (1 | question) + (1 | personid), family = binomial, data = ger_dat_response_model, sparse = TRUE, chains = 2, seed = 1234)
summary(cheating_model)

#save(cheating_model, file = "data/analysis/response_stan_cheating_model.RData")

## -------------------------------------------------------------
## The model will take some time to run. if you prefer to load --
## If you prefer to load the results, you can "uncomment" the --
## next line ---------------------------------------------------

#load("data/analysis/response_stan_cheating_model.RData") 

## Creating figures ----

# Coefficient plot

# Defining labels and order
label_list <- c("Gender: Female", "Education: Intermediate", "Education: High",
                "Age", "Internal efficacy", " Knowledge score", "Item type: Elite - Visual",
                "Item type: Elite - Verbal", "Item type: Factual", "Item difficulty", "Habitual surveytaker")

label_order <- c("Gender: Female", "Education: Intermediate", "Education: High",
                 "Age", "Internal efficacy", " Knowledge score", "Habitual surveytaker", "Item type: Elite - Visual",
                 "Item type: Elite - Verbal", "Item type: Factual", "Item difficulty")

# Extracting values
ci_model_stan <- cheating_model %>% 
  broom.mixed::tidy(conf.int = T) %>%
  dplyr::filter(term != "(Intercept)") %>%
  dplyr::mutate(pred_type = factor(ifelse(dplyr::row_number() <= 6 | dplyr::row_number() == 11, 1, 2)),
                conf_int = paste0("(",round(conf.low,3),";",round(conf.high,3), ")"),
                labels = factor(label_list, levels = label_order))

ci_stan_8 <- cheating_model %>% broom.mixed::tidy(conf.int = T, conf.level = 0.8) %>%
  dplyr::filter(term != "(Intercept)")

# FE estimates
pdf(file="figures/fig-02a-response_stan_habitual_ci.pdf", height=6, width=4, family="Helvetica")
ggplot(ci_model_stan, aes(x = estimate, y= reorder(labels, dplyr::desc(labels)), color=pred_type)) +
  geom_point(size = 2) +
  scale_x_continuous(limits = c(-4, 4)) +
  geom_text(aes(label = round(estimate,2)), position = position_nudge(y = 0.43), size = 3) +
  geom_vline(aes(xintercept = 0), linetype = "longdash", color = "black", size = 0.2) +
  geom_segment(aes(y = labels, yend = labels, x = conf.low, xend = conf.high), size = 0.4) + 
  geom_segment(aes(y = labels, yend = labels, x = ci_stan_8$conf.low, xend = ci_stan_8$conf.high), size = 0.8) +
  scale_color_manual(values = c("#e1021b", "#3d7db5")) +  
  sjPlot::theme_sjplot() +
  theme(legend.position = "none") +
  ggtitle("") +  
  labs(y ="",
       x= "Log odds") + 
  theme(plot.margin = unit(c(0,0.2,0,0), "cm")) #t,r,b,l
dev.off()


# Pred probabilities
covars <- c(paste0(respondent_covars.resc, " [all]"), paste0(question_covars.resc, " [all]"))
titles <- c("Gender", "Education", "Age", "Internal efficacy", "Knowledge score", "Habitual surveytaker", "Item type", "Item difficulty")
colors <- c(rep(rgb(219,0,23,max=255), length(respondent_covars.resc)), rep(rgb(44,105,169, max = 255), length(question_covars.resc)))
plot_list <- list()
scaleFUN <- function(x) sprintf("%.2f", x)
for(i in 1:length(covars)){
  p_pred <- ggeffects::ggpredict(cheating_model, terms = covars[i], ci.lvl = 0.95, condition = c(educ_cat = "Intermediate", gender = "Female", question = "know_candnames_cdsu.4", ques_type_diff = "Elite - Text", twohoursweek = "1")
  )
  plot_list[[i]] <- plot(p_pred) +
    labs(x = "",
         y = "",
         title = titles[i]) + 
    coord_cartesian(ylim=c(0, NA)) + 
    theme(plot.title = element_text(colour = colors[i], size = 9),
          plot.margin = unit(c(0.2,0.2,0.2,0), "cm")) #t,r,b,l
}

# function to unscale for plots
unscale <- function (x, dfvar) {return((x - mean(dfvar, na.rm = T))/(2*sd(dfvar, na.rm = T)))}

# Gender
plot_list[[1]] <- plot_list[[1]] + 
  scale_y_continuous(limits = c(0,0.025), breaks = c(0,0.01,0.02), labels = scales::percent_format(accuracy = 1))

# age
plot_list[[3]] <- plot_list[[3]] + 
  scale_x_continuous(breaks = unscale(c(25, 50, 75), dfvar = ger_dat_response$age.2), labels = c(25, 50, 75))

# efficacy
plot_list[[4]] <- plot_list[[4]] + 
  scale_x_continuous(breaks = unscale(c(-4,-2,0,2), dfvar = ger_dat_response$internal_efficacy), labels = c(-4,-2,0,2))

# knowledge score
plot_list[[5]] <- plot_list[[5]] + 
  scale_x_continuous(breaks = unscale(c(0, 0.25, 0.5, 0.75, 1), dfvar = ger_dat_response$resp_prop_correct_wo_cheating), labels = c(0, 0.25, 0.5, 0.75, 1))

# habitual surveytaker
plot_list[[6]] <- plot_list[[6]] + 
  scale_y_continuous(limits = c(0,0.025), breaks = c(0,0.01,0.02), labels = scales::percent_format(accuracy = 1)) 

# item type
plot_list[[7]] <- plot_list[[7]] +
  scale_x_continuous(labels = c("Event", "Elite-\nVisual", "Elite-\nVerbal", "Factual")) + 
  scale_y_continuous(limits = c(0,0.023), breaks = c(0,0.01,0.02), labels = scales::percent_format(accuracy = 1))

# item difficulty
plot_list[[8]] <- plot_list[[8]] +
  scale_x_continuous(breaks = unscale(c(0, 0.25, 0.5, 0.75, 1), dfvar = ger_dat_response$ques_prop_correct), labels = c(0, 0.25, 0.5, 0.75, 1)) + 
  scale_y_continuous(breaks = c(0,0.01,0.02), labels = scales::percent_format(accuracy = 1))

pdf(file="figures/fig-02b-response_stan_habitual_effects.pdf", height=8, width=5, family="Helvetica")
gridExtra::grid.arrange(grobs = plot_list, 
             ncol = 2,
             widths = c(1, 1),
             layout_matrix = rbind(c(1, 5),
                                   c(2, 6),
                                   c(3, 7),
                                   c(4, 8)))
dev.off()

closeAllConnections() # Close connection to log file